Spatial Functions#

Data source : https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page

from sqlalchemy import create_engine
import geopandas as gpd
import requests
import shutil
import folium
import json
from pathlib import Path
engine_str = "postgresql+psycopg2://docker:docker@0.0.0.0:25432/restaurants"
engine = create_engine(engine_str)
%load_ext sql
%sql $engine.url
'Connected: docker@restaurants'

Download MapPluto#

def download_file(url: str, local_path: Path):
    with requests.get(url, stream=True) as r:
        with open(local_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_path
url = 'https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_mappluto_22v2_fgdb.zip'

local_filename = url.split('/')[-1]
downloads = Path('../data/tmp/')
local_path = downloads.joinpath(local_filename)

if not downloads.exists():
    downloads.mkdir(parents=True)

# guard against unnecessary re-downloading
if not local_path.exists():
    download_file(url, local_path)
gdb = 'MapPLUTO22v2.gdb'
layer = 'MapPLUTO_22v2_clipped'

We can actually access python variables in the bash kernel. Lets use it!

%%bash -s "$local_path" "$gdb"
echo $1 $2
../data/tmp/nyc_mappluto_22v2_fgdb.zip MapPLUTO22v2.gdb
%%bash -s "$local_path" "$gdb" "$layer"

gdb="../data/tmp/$2"
if [ ! -d $gdb ]; then
    unzip -o $1 -d ../data/tmp/ > /dev/null
fi

# extract relevant info
echo $(gdalsrsinfo -e $gdb | grep "^EPSG:")
ogrinfo -so $gdb $3 | grep 'Geometry\|Feature Count\|Extent'

# import do db
ogr2ogr -f "PostgreSQL" \
 PG:"host='0.0.0.0' port='25432' user='docker' password='docker' dbname='restaurants'" $gdb $3 \
 -nlt PROMOTE_TO_MULTI \
 -nln "mappluto" \
 -lco GEOMETRY_NAME=geom \
 -overwrite 
EPSG:2263
Geometry: Multi Polygon
Feature Count: 857020
Extent: (913128.926363, 120048.985951) - (1067335.951282, 272811.183060)
Geometry Column = Shape
%%sql 
select column_name, data_type from information_schema.columns
where table_schema = 'public'
and table_name = 'mappluto'
LIMIT 5;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
column_name data_type
objectid integer
borough character varying
block integer
lot smallint
cd smallint

Queens is the only borough that matters. We will create a separate mappluto_queens table for this demo.

%sql SELECT DISTINCT borough FROM mappluto;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
borough
QN
BK
SI
MN
BX
%%sql
DROP TABLE IF EXISTS mappluto_nyc;
DROP TABLE IF EXISTS mappluto_queens;

CREATE TABLE mappluto_nyc as SELECT objectid, borough, zipcode, ownertype, yearbuilt, assesstot, geom FROM mappluto;
CREATE TABLE mappluto_queens as SELECT * FROM mappluto_nyc WHERE borough='QN';
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
Done.
857020 rows affected.
324248 rows affected.
[]

Spatial Data Quality Check#

Create indexes on newly created tables

%%sql
CREATE INDEX IF NOT EXISTS mappluto_nyc_geom_idx ON mappluto_nyc USING gist(geom);
CREATE INDEX IF NOT EXISTS mappluto_queens_geom_idx ON mappluto_queens USING gist(geom);
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
Done.
[]

We will be using the new york counties table we created in the last exercise. Let’s examine the SRIDs we have.

%sql SELECT  state_name, ST_SRID(geom) as geom_srid, ST_SRID(geom_utm18n) as geom_utm18n_srid FROM cb_2020_ny_county_500k LIMIT 1;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1 rows affected.
state_name geom_srid geom_utm18n_srid
New York 4269 3725
%sql SELECT ST_SRID(geom)  as geom_srid FROM mappluto_queens LIMIT 1;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1 rows affected.
geom_srid
2263

Check if geometries simple/valid

%%sql
SELECT *
FROM mappluto_queens 
WHERE ST_IsValid(geom) is not true
LIMIT 100;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
objectid borough zipcode ownertype yearbuilt assesstot geom
%%sql
SELECT *
FROM mappluto_queens 
WHERE ST_IsSimple(geom) is not true
LIMIT 100;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
objectid borough zipcode ownertype yearbuilt assesstot geom
%%sql
SELECT *
FROM cb_2020_ny_county_500k
WHERE ST_IsValid(geom) is not true
LIMIT 100;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
ogc_fid statefp countyfp countyns affgeoid geoid name namelsad stusps state_name lsad aland awater geom geom_utm18n
%%sql
SELECT *
FROM cb_2020_ny_county_500k
WHERE ST_IsSimple(geom) is not true
LIMIT 100;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
ogc_fid statefp countyfp countyns affgeoid geoid name namelsad stusps state_name lsad aland awater geom geom_utm18n

Visual Verification of SRID assignments#

We export the tables to GeoJSON for creating leaflet web maps via folium

%%sql pluto_sample <<
 SELECT ST_AsGeoJSON(subq) as geojson 
 FROM (
   SELECT 
    objectid, 
    zipcode, 
    ownertype,
    yearbuilt,
    assesstot,
    ST_Transform(geom, 4326) as geom 
    FROM mappluto_queens ORDER BY assesstot DESC LIMIT 1000
 ) as subq;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1000 rows affected.
Returning data to local variable pluto_sample
%%sql ny_county <<
SELECT ST_AsGeoJSON(subq.*) as geojson 
FROM (
    SELECT     
        ogc_fid,
        statefp,
        countyfp,
        countyns,
        affgeoid,
        geoid,
        name,
        namelsad,
        stusps,
        state_name,
        lsad	,
        aland,
        awater,
        ST_Transform(geom, 4326) as geom
    FROM cb_2020_ny_county_500k
    ) as subq;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
62 rows affected.
Returning data to local variable ny_county
QUEENS = [40.7292, -73.9049]

def to_map(result_set, location=QUEENS, zoom=13, fillcolor='orange'):
    m = folium.Map(location=location, zoom_start=zoom)

    for _, r in result_set.DataFrame().iterrows():
        geojson = json.loads(r.geojson)

        if 'properties' in geojson.keys():
            properties = geojson['properties']
        else:
            properties = None

        popup_cols = [c for c in properties.keys() if 'geom' not in c.lower()] if properties is not None else []
        geo_j = folium.GeoJson(data=geojson,
                                style_function=lambda x: {'fillColor': fillcolor},
                                popup=folium.GeoJsonPopup(popup_cols)
        )
        geo_j.add_to(m)

    return m

to_map(pluto_sample)
Make this Notebook Trusted to load map: File -> Trust Notebook
to_map(ny_county, zoom = 10, fillcolor='purple')
Make this Notebook Trusted to load map: File -> Trust Notebook

Non-Spatial Data Quality Check#

%sql SELECT yearbuilt FROM  mappluto_queens WHERE yearbuilt < 1800 ORDER BY yearbuilt DESC LIMIT 10;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
yearbuilt
1785
1775
1735
1729
1706
1694
1661
0
0
0

yearbuilt=0 should be a null value.

%%sql
UPDATE mappluto SET yearbuilt=NULL WHERE yearbuilt=0;
UPDATE mappluto_nyc SET yearbuilt=NULL WHERE yearbuilt=0;
UPDATE mappluto_queens SET yearbuilt = NULL WHERE yearbuilt=0;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
40240 rows affected.
40240 rows affected.
12047 rows affected.
[]
%sql SELECT * FROM mappluto_queens WHERE zipcode <= 0;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
objectid borough zipcode ownertype yearbuilt assesstot geom
%%sql
SELECT COUNT(*) FROM mappluto_queens WHERE assesstot <= 0;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1 rows affected.
count
1220
assesstot_low = %sql SELECT ST_AsGeoJSON(subq.*) as geojson FROM (SELECT ownertype, assesstot, ST_Transform(geom, 4326) as geom FROM mappluto_queens WHERE assesstot <=0) as subq;
to_map(assesstot_low, fillcolor='red')
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1220 rows affected.
Make this Notebook Trusted to load map: File -> Trust Notebook

It looks like many of the assesstot=0 examples are public property. I’m not sure these should be considered NULL, so we’ll leave them and account for it in our aggregations.

Spatial Calculations#

The mappluto geom column is in EPSG:2263, a projection in US feet.

%%sql 
SELECT 
objectid, 
ROUND(ST_Area(geom)::numeric,2) as area_ft2, 
ROUND(ST_Perimeter(geom)::numeric,2) as perimeter_ft,
DATE_PART('year', AGE(CURRENT_DATE, TO_DATE(yearbuilt::varchar(4), 'YYYY')))::int as age_years
FROM mappluto_queens
LIMIT 10;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
objectid area_ft2 perimeter_ft age_years
375694 2556.95 256.83 None
375486 1648.16 236.40 151
375487 1752.24 238.35 151
375489 1743.10 238.08 151
375493 967.43 127.48 112
375495 2489.58 252.48 112
375496 2678.73 256.20 102
375498 2431.73 253.78 51
375500 12585.91 452.97 92
375501 2758.56 258.62 91
%%sql 
SELECT 
    zipcode,
    ROUND(AVG(ST_Area(geom))::numeric,2) as mean_area_ft2, 
    ROUND(AVG(assesstot)::numeric,2) as mean_assesstot_dollar
FROM mappluto_queens
-- edge case guards
WHERE assesstot > 0 AND zipcode is NOT NULL
GROUP BY zipcode
ORDER BY mean_area_ft2 DESC
LIMIT 10;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
zipcode mean_area_ft2 mean_assesstot_dollar
11371 11776044.15 1925443350.00
11359 3111281.36 33181770.00
11697 1637333.54 5959041.85
11695 1547725.00 31394250.00
11430 799130.72 11205355.00
11005 687929.59 24933921.43
11109 35344.61 37012320.50
11422 30900.67 1169964.75
11451 29977.68 411300.00
11693 23625.09 176013.70
%%sql 
WITH subq AS (
    SELECT 
    zipcode,
    DATE_PART('year', AGE(CURRENT_DATE, TO_DATE(yearbuilt::varchar(4), 'YYYY')))::int as age_years
    FROM mappluto_queens
)
SELECT zipcode, mode() WITHIN GROUP (ORDER BY age_years) as age_mode
FROM subq
GROUP BY zipcode
-- ensure at least one non-null age
HAVING SUM(CASE WHEN age_years IS NULL then 0 else 1 end) >= 1
LIMIT 10;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
zipcode age_mode
11001 67
11004 72
11005 50
11040 72
11101 91
11102 92
11103 97
11104 95
11105 92
11106 92
%%sql
WITH subq AS (
    SELECT 
        zipcode,
        ROUND(ST_Area(geom)::numeric, 2) as area_ft2,
        DATE_PART('year', AGE(CURRENT_DATE, TO_DATE(yearbuilt::varchar(4), 'YYYY')))::int as age_years
    FROM mappluto_queens
)
SELECT 
    zipcode,
    percentile_disc(0.5) WITHIN GROUP (ORDER BY area_ft2) AS median_area_ft2,
    percentile_disc(0.5) WITHIN GROUP (ORDER BY age_years) AS median_age
FROM subq
GROUP BY zipcode
-- ensure at least one non-null age
HAVING SUM(CASE WHEN age_years IS NULL then 0 else 1 end) >= 1
LIMIT 10;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
zipcode median_area_ft2 median_age
11001 3975.97 72
11004 4028.26 72
11005 100726.73 50
11040 4264.12 72
11101 3942.21 91
11102 2536.56 92
11103 2478.47 94
11104 2207.72 93
11105 2324.60 92
11106 2472.45 92

Simplification#

ST_Simplify returns a “simplified” geometry using Douglas-Peucker algorithm. Topology is not preserved.

tolerance = 0.001

%%sql simp001 <<
WITH subq AS (
    SELECT 
    name,
    ST_Simplify(ST_Transform(geom, 4326), 0.001) as geom
    FROM cb_2020_ny_county_500k
)
SELECT ST_AsGeoJSON(subq.*) as geojson
FROM subq
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
62 rows affected.
Returning data to local variable simp001
to_map(simp001, zoom=12, fillcolor='green')
Make this Notebook Trusted to load map: File -> Trust Notebook

tolerance = 0.01

%%sql simp01 <<
WITH subq AS (
    SELECT 
    name,
    ST_Simplify(ST_Transform(geom, 4326), 0.01) as geom
    FROM cb_2020_ny_county_500k
)
SELECT ST_AsGeoJSON(subq.*) as geojson
FROM subq
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
62 rows affected.
Returning data to local variable simp01
to_map(simp01, zoom=12, fillcolor='green')
Make this Notebook Trusted to load map: File -> Trust Notebook